In this notebook I import gene-level counts from snow crab RNA-Seq data. Trimmed RNASeq reads were aligned to the tanner crab genome using the STAR aligner, and counts (i.e. the number of reads aligning to each gene) were determined during alignment by STAR using the tanner annotation file. NOTE: before the alignment I converted the .gff annotation file (final_annotation.gff) to a .gtf file (final_annotation_LHS.gtf) for STAR.
list.of.packages <- c("tidyverse", "reshape2", "here", "plotly", "purrr", "janitor", "readxl", "clipr") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
NOTE: this code chunk uses the bash language, not R
rm ../results/star/countsfilenames.txt
for file in ../results/star/*.ReadsPerGene.out.tab
do
filename="$(echo $file)"
sample="$(basename -a $filename | cut -d "." -f 1)"
printf "%s\t%s\n" "$filename" "$sample" >> ../results/star/countsfilenames.txt
done
head -n 5 ../results/star/countsfilenames.txt
## ../results/star/1.ReadsPerGene.out.tab 1
## ../results/star/10.ReadsPerGene.out.tab 10
## ../results/star/11.ReadsPerGene.out.tab 11
## ../results/star/12.ReadsPerGene.out.tab 12
## ../results/star/13.ReadsPerGene.out.tab 13
# Create object from the countsfilenames.txt file
filenames <- read_delim(file="../results/star/countsfilenames.txt", delim = "\t", col_names = c("filename", "sample")) %>%
mutate(sampleID=paste("S", sample, sep=""))
## Rows: 63 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): filename
## dbl (1): sample
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
files <- file.path(filenames$filename) #extract vector of filenames
all(file.exists(files)) #easy code to check that all files exist!
## [1] TRUE
# Create sample.info object with annotation name for each sample/file, this will be used throughout the analysis
sample.info <- read_excel("../data/Crabs for RNA analysis.xlsx", sheet = "Sheet1",
skip = 8, col_names = c("sample", "pH", "duration"), na = "N/A") %>%
mutate_at(vars(pH, duration), as.factor) %>%
mutate(OA=as.factor(case_when(
pH=="pH 7.5" ~ "severe",
pH=="pH 7.8" ~ "moderate",
pH=="Ambient" ~ "ambient"))) %>%
mutate(treatment=as.factor(case_when(
(OA=="severe" & duration=="long") ~ "severe_long",
(OA=="severe" & duration=="short") ~ "severe_short",
(OA=="moderate" & duration=="long") ~ "moderate_long",
(OA=="moderate" & duration=="short") ~ "moderate_short",
TRUE ~ "ambient"))) %>%
left_join(filenames)
## Joining, by = "sample"
selecting only the first 2 columns (1=gene ID, 2=count); rename columns
file_list <- vector(mode = "list", length = nrow(filenames))
names(file_list) <- c(filenames$sampleID)
# Check out number of counts & number of genes for each sammple
for (i in 1:nrow(filenames)) {
file_list[[i]] <- data.frame(read.delim(file=files[i], header = F))[-1:-4,1:2]
names(file_list[[i]]) <- c("gene", filenames$sampleID[i])
print(paste("Total COUNTS,", names(file_list[[i]][2]), ":", prettyNum(sum(file_list[[i]][2]), big.mark=","), sep=" "))
print(paste("Total GENES,", names(file_list[[i]][2]), ":", prettyNum(nrow(file_list[[i]] %>% filter(.[[2]] != 0)), big.mark=","), sep=" "))
}
## [1] "Total COUNTS, S1 : 27,356,371"
## [1] "Total GENES, S1 : 15,813"
## [1] "Total COUNTS, S10 : 25,950,594"
## [1] "Total GENES, S10 : 16,458"
## [1] "Total COUNTS, S11 : 23,538,132"
## [1] "Total GENES, S11 : 16,122"
## [1] "Total COUNTS, S12 : 28,466,091"
## [1] "Total GENES, S12 : 15,936"
## [1] "Total COUNTS, S13 : 26,243,223"
## [1] "Total GENES, S13 : 15,608"
## [1] "Total COUNTS, S14 : 32,556,291"
## [1] "Total GENES, S14 : 16,257"
## [1] "Total COUNTS, S15 : 27,267,105"
## [1] "Total GENES, S15 : 16,201"
## [1] "Total COUNTS, S16 : 27,765,649"
## [1] "Total GENES, S16 : 16,008"
## [1] "Total COUNTS, S17 : 25,949,479"
## [1] "Total GENES, S17 : 16,866"
## [1] "Total COUNTS, S18 : 23,261,655"
## [1] "Total GENES, S18 : 15,837"
## [1] "Total COUNTS, S19 : 24,146,983"
## [1] "Total GENES, S19 : 14,964"
## [1] "Total COUNTS, S2 : 19,577,558"
## [1] "Total GENES, S2 : 16,169"
## [1] "Total COUNTS, S20 : 22,467,848"
## [1] "Total GENES, S20 : 15,491"
## [1] "Total COUNTS, S21 : 19,897,996"
## [1] "Total GENES, S21 : 14,782"
## [1] "Total COUNTS, S22 : 24,263,123"
## [1] "Total GENES, S22 : 15,418"
## [1] "Total COUNTS, S23 : 23,770,831"
## [1] "Total GENES, S23 : 15,369"
## [1] "Total COUNTS, S24 : 21,012,750"
## [1] "Total GENES, S24 : 15,959"
## [1] "Total COUNTS, S25 : 26,330,257"
## [1] "Total GENES, S25 : 15,783"
## [1] "Total COUNTS, S26 : 23,863,525"
## [1] "Total GENES, S26 : 16,665"
## [1] "Total COUNTS, S27 : 22,485,686"
## [1] "Total GENES, S27 : 15,065"
## [1] "Total COUNTS, S28 : 19,402,036"
## [1] "Total GENES, S28 : 15,178"
## [1] "Total COUNTS, S29 : 22,378,027"
## [1] "Total GENES, S29 : 15,167"
## [1] "Total COUNTS, S3 : 22,297,607"
## [1] "Total GENES, S3 : 15,569"
## [1] "Total COUNTS, S30 : 25,014,821"
## [1] "Total GENES, S30 : 14,932"
## [1] "Total COUNTS, S31 : 20,806,911"
## [1] "Total GENES, S31 : 15,612"
## [1] "Total COUNTS, S32 : 26,829,852"
## [1] "Total GENES, S32 : 16,496"
## [1] "Total COUNTS, S33 : 24,993,968"
## [1] "Total GENES, S33 : 15,019"
## [1] "Total COUNTS, S34 : 21,352,629"
## [1] "Total GENES, S34 : 14,913"
## [1] "Total COUNTS, S35 : 23,653,101"
## [1] "Total GENES, S35 : 15,108"
## [1] "Total COUNTS, S36 : 19,468,532"
## [1] "Total GENES, S36 : 14,173"
## [1] "Total COUNTS, S37 : 20,369,652"
## [1] "Total GENES, S37 : 15,415"
## [1] "Total COUNTS, S38 : 24,213,565"
## [1] "Total GENES, S38 : 15,817"
## [1] "Total COUNTS, S39 : 27,002,471"
## [1] "Total GENES, S39 : 15,327"
## [1] "Total COUNTS, S4 : 14,176,786"
## [1] "Total GENES, S4 : 14,240"
## [1] "Total COUNTS, S40 : 28,297,886"
## [1] "Total GENES, S40 : 15,910"
## [1] "Total COUNTS, S41 : 26,946,623"
## [1] "Total GENES, S41 : 16,173"
## [1] "Total COUNTS, S42 : 29,231,086"
## [1] "Total GENES, S42 : 15,712"
## [1] "Total COUNTS, S43 : 21,222,982"
## [1] "Total GENES, S43 : 15,818"
## [1] "Total COUNTS, S44 : 44,505,472"
## [1] "Total GENES, S44 : 17,626"
## [1] "Total COUNTS, S45 : 493,648"
## [1] "Total GENES, S45 : 7,947"
## [1] "Total COUNTS, S46 : 21,482,663"
## [1] "Total GENES, S46 : 16,511"
## [1] "Total COUNTS, S47 : 25,423,628"
## [1] "Total GENES, S47 : 16,491"
## [1] "Total COUNTS, S48 : 24,483,637"
## [1] "Total GENES, S48 : 15,963"
## [1] "Total COUNTS, S49 : 23,676,454"
## [1] "Total GENES, S49 : 15,556"
## [1] "Total COUNTS, S5 : 21,763,794"
## [1] "Total GENES, S5 : 15,299"
## [1] "Total COUNTS, S50 : 25,478,463"
## [1] "Total GENES, S50 : 15,830"
## [1] "Total COUNTS, S51 : 27,365,340"
## [1] "Total GENES, S51 : 16,569"
## [1] "Total COUNTS, S52 : 23,270,277"
## [1] "Total GENES, S52 : 16,432"
## [1] "Total COUNTS, S53 : 32,346,325"
## [1] "Total GENES, S53 : 16,699"
## [1] "Total COUNTS, S54 : 36,722,531"
## [1] "Total GENES, S54 : 16,760"
## [1] "Total COUNTS, S55 : 28,187,412"
## [1] "Total GENES, S55 : 16,235"
## [1] "Total COUNTS, S56 : 33,181,676"
## [1] "Total GENES, S56 : 16,715"
## [1] "Total COUNTS, S57 : 25,071,047"
## [1] "Total GENES, S57 : 15,657"
## [1] "Total COUNTS, S58 : 26,296,226"
## [1] "Total GENES, S58 : 16,251"
## [1] "Total COUNTS, S59 : 23,712,683"
## [1] "Total GENES, S59 : 15,277"
## [1] "Total COUNTS, S6 : 20,900,574"
## [1] "Total GENES, S6 : 14,864"
## [1] "Total COUNTS, S60 : 25,561,202"
## [1] "Total GENES, S60 : 15,847"
## [1] "Total COUNTS, S61 : 32,684,320"
## [1] "Total GENES, S61 : 16,560"
## [1] "Total COUNTS, S62 : 45,169,493"
## [1] "Total GENES, S62 : 16,804"
## [1] "Total COUNTS, S63 : 23,263,690"
## [1] "Total GENES, S63 : 15,894"
## [1] "Total COUNTS, S7 : 16,502,090"
## [1] "Total GENES, S7 : 13,939"
## [1] "Total COUNTS, S8 : 25,240,705"
## [1] "Total GENES, S8 : 16,127"
## [1] "Total COUNTS, S9 : 25,121,288"
## [1] "Total GENES, S9 : 16,150"
file_list[[1]] %>% head(n=20)
## gene S1
## 5 gene_21170 0
## 6 gene_21172 0
## 7 gene_20492 0
## 8 gene_20493 2
## 9 gene_22758 0
## 10 gene_22759 0
## 11 gene_22760 114
## 12 gene_22756 0
## 13 gene_22761 7
## 14 gene_22757 1
## 15 gene_22762 2
## 16 gene_22083 0
## 17 gene_22101 0
## 18 gene_22102 0
## 19 gene_22103 2
## 20 gene_22104 0
## 21 gene_22105 0
## 22 gene_22084 0
## 23 gene_22106 1
## 24 gene_22085 98
counts <- file_list %>% purrr::reduce(full_join, by = "gene") %>% column_to_rownames(var="gene")
head(counts) #resulting dataframe has genes by row, and samples by column
## S1 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S2 S20 S21 S22 S23 S24
## gene_21170 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_21172 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20492 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20493 2 0 1 0 0 1 0 0 0 0 0 0 0 0 0 3 0
## gene_22758 0 15 2 3 1 4 5 0 7 2 0 7 3 0 0 3 3
## gene_22759 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## S25 S26 S27 S28 S29 S3 S30 S31 S32 S33 S34 S35 S36 S37 S38 S39 S4
## gene_21170 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_21172 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20492 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20493 5 1 0 0 0 0 1 2 0 3 2 0 0 0 0 0 0
## gene_22758 2 0 7 4 1 7 1 0 2 0 2 7 0 5 0 4 16
## gene_22759 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## S40 S41 S42 S43 S44 S45 S46 S47 S48 S49 S5 S50 S51 S52 S53 S54 S55
## gene_21170 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_21172 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20492 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20493 0 0 1 0 3 0 1 0 0 0 2 0 1 0 0 0 0
## gene_22758 0 3 2 5 11 0 3 0 0 4 1 6 0 5 1 3 3
## gene_22759 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## S56 S57 S58 S59 S6 S60 S61 S62 S63 S7 S8 S9
## gene_21170 0 0 0 0 0 0 0 0 0 0 0 0
## gene_21172 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20492 0 0 0 0 0 0 0 0 0 0 0 0
## gene_20493 0 0 1 1 1 0 0 2 0 0 0 0
## gene_22758 1 4 0 5 0 3 7 6 7 5 0 5
## gene_22759 0 0 0 0 0 0 0 0 0 0 0 0
print(paste("Number of samples:", ncol(counts), sep=" "))
## [1] "Number of samples: 63"
print(paste("Total number of genes in dataframe:", prettyNum(nrow(counts), big.mark = ","), sep=" "))
## [1] "Total number of genes in dataframe: 47,451"
print(paste("Average number of genes per sample:", prettyNum(mean(colSums(counts != 0)), big.mark = ","), sep=" "))
## [1] "Average number of genes per sample: 15,672.27"
print(paste("Total counts, all samples:", prettyNum(sum(colSums(counts)), big.mark = ","), sep=" "))
## [1] "Total counts, all samples: 1,571,734,320"
#print(paste("Counts for", colnames(counts %>% select(contains("Tank"))), ":", prettyNum(colSums(counts %>% select(contains("Tank"))), big.mark = ","), sep=" "))
#inspect total counts by sample - this generates an interactive plot, hover over bars to identify sample
ggplotly(
ggplot(data.frame(colSums(counts)) %>%
dplyr::rename(count.total = 1) %>% rownames_to_column(var="sample")) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))
Sample 45 has very few counts, remove
remove.list <- c("S45")
counts <- counts[ , -which(names(counts) %in% remove.list)]
sample.info <- sample.info[ -which(sample.info$sampleID %in% remove.list), ]
# resave sample info object
save(sample.info, file="../data/sample.info")
nrow(sample.info) == ncol(counts) #should = TRUE if sample.info and gene count matrix countain the same samples
## [1] TRUE
counts.t <- as.data.frame(t(counts))
Here we remove genes (rows) that are expressed at very low levels. These are not likely to be biologically relevant and by reducing the number of genes to be assessed for differential expression we reduce the impact of the multiple-comparison correction. The definition of “low-frequency genes” can be adjusted - there’s no hard and fast rule - but this is what I have been using: “Genes with mean count <10 across all samples or those with counts <30 across at minimum 10% of the samples were discarded.” You’ll see that while many genes were discarded (~36,000 were dropped, out of ~47,500), the vast majority of fragments were retained (99.955%)! This is because those 36k genes only had a few reads aligning to them.
keep1 <- colMeans(counts.t, na.rm=TRUE) >= 10 #identify genes with mean count >= 10 across all samples (excluding NAs = 10)
keep2 <- rowSums( counts >= 10 ) >= 0.1*43 #identify genes with counts>=10 across at minimum 10% of the samples
keep <- unique(c(names(which(keep1 == T)), names(which(keep2 == T)))) # list of genes meeting one of the two above criteria
counts.ts <- counts.t[,keep]
# Print summary of gene filtering.
print(paste("# genes before filtering:", ncol(counts.t)))
## [1] "# genes before filtering: 47451"
print(paste("# genes remaining after pre-filtering:", ncol(counts.ts)))
## [1] "# genes remaining after pre-filtering: 11491"
print(paste("# of genes dropped:", ncol(counts.t) - ncol(counts.ts), sep=" "))
## [1] "# of genes dropped: 35960"
print(paste("% of fragments remaining after pre-filtering: ", signif(100*sum(counts.ts, na.rm = T)/sum(counts.t, na.rm = T), digits = 5), "%", sep=""))
## [1] "% of fragments remaining after pre-filtering: 99.955%"
print(paste("Number of fragments dropped: ", signif(sum(counts.t, na.rm = T)-sum(counts.ts, na.rm = T), digits = 5)))
## [1] "Number of fragments dropped: 712310"
print(paste("% of fragments dropped: ", signif(100*(sum(counts.t, na.rm = T)-sum(counts.ts, na.rm = T))/sum(counts.t, na.rm = T), digits = 5), "%", sep=""))
## [1] "% of fragments dropped: 0.045334%"
print(paste("Number of fragments remaining: ", signif(sum(counts.ts, na.rm = T), digits = 5)))
## [1] "Number of fragments remaining: 1570500000"
save(counts, file = "../results/counts")
save(counts.t, file = "../results/counts.t")
save(counts.ts, file = "../results/counts.ts")
Many of the enrichment analyses I run require Uniprot IDs for each gene. Not all annotation files for genomes (or transcriptomes) have Uniprot ID’s associated with each gene. So, to get these ID’s I run a program called Blast to match the sequences of known genes in the reference genome to sequences in the Uniprot/Swissprot database. Blast is run on Sedna or Mox and takes a while (hours to days, depending on the number/size of gene sequences). Before doing that, though, I need to create a .bed file which identifies the location of each gene in the genome. Once I create a bed file listing genes, I will use that with getfasta from bedtools (another bash language program) to extract gene sequences, which I will then use with blast to identify gene function
NOTE: GFF format use a 1-based coordinate system, while BED format uses a 0-based coordinate system. I therefore need to convert my coordinates. Check out this helpful coordinate system summary.
read.delim(file = "../references/final_annotation.gff", sep = "", header = F) %>%
mutate(V3=as.factor(V3)) %>% filter(V3=="gene") %>%
select(V1, V4, V5, V9) %>%
mutate(V4=as.numeric(V4), V5=as.numeric(V5)) %>%
mutate(V4=V4-1) %>% #convert from 1-based to 0-based by subtracting 1 from the start position
mutate(ID=str_extract(V9, "ID=(.*?);")) %>%
mutate(ID=str_remove(ID, ";")) %>% mutate(ID=str_remove(ID, "ID=")) %>%
select(V1,V4,V5,ID) %>%
write.table(., "../references/tanner_genes.bed", sep = "\t",
col.names = F, row.names = F, quote = F)
This has 4 columns: 1. Chromosome/contig ID (on tanner genome), 2. Gene start locus 3. Gene end locus, 4. Gene ID
head ../references/tanner_genes.bed
## ptg000001l 307959 309359 gene_21170
## ptg000001l 308964 309738 gene_21172
## ptg000002l 40323 41936 gene_20492
## ptg000002l 42740 43499 gene_20493
## ptg000003l 6099 7875 gene_22758
## ptg000003l 8823 9462 gene_22759
## ptg000003l 28603 41091 gene_22760
## ptg000003l 48542 50825 gene_22756
## ptg000003l 61793 67795 gene_22761
## ptg000003l 64361 65873 gene_22757
blast on Sedna/Mox to annotate genes.Used the script blast-tanner.sh, which needs the following inputs:
1. The most current Uniprot/Swissprot database, which can be downloaded from https://www.uniprot.org/help/downloads. Select the “Reviewed (Swiss-Prot)” in fasta format.
2. The genome fasta file that was used for aligning reads. E.g. the tanner fasta was tanner.asm.hic.p_ctg.fa. Before blasting, the script uses bedtools getfasta to grab the sequences for each gene from the genome. It’s critical that the annotation file (.gff) that was used in the previous step to generate the .bed file is associated with the same genome file!
3. The programs blast and bedtools need to be installed and loaded (the above script shows how they can be loaded on Mox)
Once the blast results have finished, proceed to next step!
NOTE: To run the below code you’ll need the file tanner_genes_blastx.tab. It’s very large, so grab it from Google Drive and save to the snowcrab/references/ directory.
blastx output format 6 is tab file with the following columns:
1. qaccver = Query accesion.version
2. saccver = Subject accession.version
3. pident = Percentage of identical matches
4. length = Alignment length
5. mismatch = Number of mismatches
6. gapopen = Number of gap openings
7. qstart = Start of alignment in query
8. qend = End of alignment in query
9. sstart = Start of alignment in subject
10. send = End of alignment in subject
11. evalue = Expect value
12. bitscore = Bit score
tanner.blast <-
read_delim(file = "../references/tanner_genes_blastx.tab", delim = "\t",
col_names = c("qaccver", "saccver", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore")) %>%
separate(qaccver, sep = ":|-", into = c("SeqID","Start","End")) %>%
mutate_at(vars(Start, End), as.numeric) %>%
separate(saccver, sep="\\|", into=c("na", "SPID", "gene.Uni"), remove = F) %>%
dplyr::select(-na) %>% separate(gene.Uni, sep="_", into=c("gene.Uni", "species"), remove=T) %>%
group_by(SeqID, Start, End) %>% dplyr::slice(which.min(evalue)) %>% # where multiple blast hits for same gene, select one with minimum e-value
left_join(., read_delim(file="../references/tanner_genes.bed", delim="\t", col_names = c("SeqID", "Start", "End", "GeneID"))) %>%
dplyr::select(GeneID, everything()) %>%
clean_names()
## Rows: 2370863 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): qaccver, saccver
## dbl (10): pident, length, mismatch, gapopen, qstart, qend, sstart, send, eva...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 47451 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): SeqID, GeneID
## dbl (2): Start, End
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = c("SeqID", "Start", "End")
# E-VALUE FILTERING
# Count annotated genes with e-value filters
tanner.blast %>% filter(evalue < 1.0e-10) %>% nrow() # 12,993 genes have e-value <1e-10
## [1] 12993
tanner.blast %>% filter(evalue < 1.0e-15) %>% nrow() # 9,979 genes have e-value <1e-15
## [1] 9979
tanner.blast %>% filter(evalue < 1.0e-20) %>% nrow() # 7,252 genes have e-value <1e-20
## [1] 7252
# Filter blast results to meet desired e-value threshold - here we choose 1e-10
tanner.blast <- tanner.blast %>% filter(evalue < 1.0e-10)
save(tanner.blast, file = "../references/tanner.blast")
load(file = "../references/tanner.blast")
NOTE: I needed GO IDs and gene functions, which weren’t included in the blast file. So I copied the column containing all Uniprot IDs from the blast object, then pasted those into the tool Uniprot batch retrieval tool (https://www.uniprot.org/uploadlists/), and selected the columns: Entry, Entry name, Protein names, Gene names, Organism, Gene ontology (biological process), Gene ontology (cellular componenet), Gene ontology (molecular processes), Gene ontology (GO), Gene ontology Ids. I then downloaded all entries to a tab file, saved as: /references/tanner_genes_GO.tsv. Now I’ll read that into R and join with the tanner.blast dataframe to link GO IDs with genes for exploration and enrichment analyses.
# Use this code to get list of Uniprot SPID (gene identifiers) to input into the Uniprot Batch Retrieval tool, to obtain GO terms for each gene
tanner.blast %>% ungroup() %>% dplyr::select(spid) %>% distinct() %>%
na.omit() %>% unlist() %>% as.vector() %>% write_clip(allow_non_interactive = TRUE)
# After downloading the tanner_genes_GO.tsv file, add GO terms to Uniprot annotation object
tanner.blast.GO <- left_join(tanner.blast, read_delim(file = "../references/tanner_genes_GO.tsv",
delim = "\t") %>% clean_names(),
by = c("spid"="entry")) %>% ungroup()
## Rows: 3776 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): From, Entry, Entry Name, Protein names, Gene Names, Organism, Gene...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
save(tanner.blast.GO, file = "../references/tanner.blast.GO")
#load(file = "../references/tanner.blast.GO")
# Preview annotated list of genes with their Gene Ontology information, which describes the various functions performed by each gene
head(tanner.blast.GO)
## # A tibble: 6 × 28
## gene_id seq_id start end saccver spid gene_uni species pident length
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 gene_22760 ptg0000… 28603 41091 sp|Q9D… Q9D9… F186A MOUSE 29.5 370
## 2 gene_22762 ptg0000… 67966 70416 sp|Q9U… Q9UL… ANR50 HUMAN 27.5 236
## 3 gene_22083 ptg0000… 16091 16581 sp|Q8I… Q8IZ… F200C HUMAN 33.3 93
## 4 gene_22106 ptg0000… 156430 157971 sp|O75… O755… JERKY HUMAN 41.1 331
## 5 gene_22085 ptg0000… 170352 178379 sp|Q6P… Q6P6… DDB1 XENLA 83.3 78
## 6 gene_22090 ptg0000… 254082 260100 sp|Q3U… Q3U1… DDB1 MOUSE 72.4 58
## # … with 18 more variables: mismatch <dbl>, gapopen <dbl>, qstart <dbl>,
## # qend <dbl>, sstart <dbl>, send <dbl>, evalue <dbl>, bitscore <dbl>,
## # from <chr>, entry_name <chr>, protein_names <chr>, gene_names <chr>,
## # organism <chr>, gene_ontology_biological_process <chr>,
## # gene_ontology_cellular_component <chr>, gene_ontology_go <chr>,
## # gene_ontology_molecular_function <chr>, gene_ontology_i_ds <chr>
Add gene name and functional information to each gene in our snow crab dataset!
counts.annot.tanner <- right_join(
tanner.blast %>% ungroup() %>% dplyr::select(gene_id, spid, gene_uni, species, pident, evalue, gene_id, seq_id, start, end, gene_id),
counts.ts %>% t() %>% as.data.frame() %>% rownames_to_column("gene_id"), "gene_id") %>%
left_join(tanner.blast.GO %>% dplyr::select(gene_id, protein_names, gene_names), "gene_id")
save(counts.annot.tanner, file = "../results/counts.annot.tanner")
#load(file = "../results/counts.annot.tanner")
write.csv(counts.annot.tanner, file = "../results/counts.annot.tanner.csv", row.names = F, quote = F) #save annotated gene information to .csv file
right_join(
tanner.blast %>% ungroup() %>% dplyr::select(gene_id, spid, gene_uni, species, pident, evalue),
counts.ts %>% t() %>% as.data.frame() %>% rownames_to_column(var = "gene_id"), "gene_id") %>% filter(!is.na(spid)) %>%
nrow()
## [1] 5808
read_delim(file = "../references/final_annotation.gff", delim = "\t", col_names = F, skip = 8) %>%
mutate(X3=as.factor(X3)) %>% filter(X3=="gene") %>% nrow()
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 257189 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): X1, X2, X3, X6, X7, X8, X9
## dbl (2): X4, X5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] 47449